3 Cluster analysis

load("data/data.Rdata")

3.1 Cluster prevalence

Number of strategies each cluster was captured in.

cluster_prevalence <- cluster_counts %>%
  filter(read_count>0) %>%
  group_by(secondary_cluster) %>%
  summarise(n_strategy = n_distinct(overall_strategy)) %>%
  arrange(desc(n_strategy))

#Arranged by phylum and cluster prevalence
cluster_prevalence_phylum <- cluster_prevalence %>%
  left_join(cluster_taxonomy,by="secondary_cluster")

3.2 Cluster frequency

Number of clusters each strategy recovered.

cluster_number <- cluster_counts %>%
  filter(read_count>0) %>%
  group_by(overall_strategy) %>%
  summarise(n = n_distinct(secondary_cluster)) %>%
  arrange(desc(n)) %>%
  mutate(overall_strategy=factor(overall_strategy,levels=rev(overall_strategy)))

cluster_number %>%
  rename(Strategy=overall_strategy) %>%
  select(Strategy,n) %>%
  tt()
tinytable_8y6615bmby8mki40pcb5
Strategy n
coassembly_timepoint_all 176
coassembly_animal 163
coassembly_timepoint_cage 160
multicoverage_animal 130
multicoverage_timepoint_cage 126
single_coverage 117
multisplit_timepoint_all 115
multisplit_timepoint_cage 110
multisplit_animal 99
multicoverage_timepoint_all 93
cluster_number %>%
  filter(overall_strategy != "cobinning_treatment") %>%
  ggplot(aes(x = n, y = overall_strategy)) +
  geom_col(color="#666666") +
  coord_cartesian(xlim = c(90,200))+
  theme_classic() +
  theme() +
  labs(x="Number of clusters", y="Strategy")

3.3 Cluster heatmap

Overview of clusters per strategy.

cluster_counts %>%
  left_join(cluster_taxonomy,by="secondary_cluster") %>%
  mutate(Phylum=ifelse(is.na(Phylum),"Bacillota_A",Phylum)) %>%
  mutate(secondary_cluster=factor(secondary_cluster,levels=rev(cluster_tree$tip.label))) %>%
  mutate(overall_strategy=factor(overall_strategy,levels=rev(cluster_number$overall_strategy)))  %>%
  group_by(overall_strategy, secondary_cluster) %>%
  sample_n(1) %>%
  ungroup() %>%
  ggplot(aes(x = secondary_cluster, y = overall_strategy, fill=Phylum)) +
  geom_tile(color="#ffffff") +
  scale_fill_manual(values=phylum_colors) +
  theme_classic() +
  theme(axis.text.x = element_blank(),
        axis.title.y = element_blank()) +
  xlab("MAG clusters")

3.4 Cluster abundance

Maximum read count

maximum_read_count <- cluster_counts %>%
  group_by(secondary_cluster) %>%
  slice_max(order_by = read_count) %>%
  ungroup()
maximum_read_count %>%
  mutate(secondary_cluster=factor(secondary_cluster,levels=rev(cluster_tree$tip.label))) %>%
  ggplot(aes(x = secondary_cluster, y = read_count)) +
  geom_col(color="#666666") +
  theme_classic() +
  theme(axis.text.x = element_blank()) +
  labs(x="MAG clusters", y="Number of reads")

3.5 MAG mapping

Average percentage of read mapped in each strategy against the respective catalogue.

all_counts <- read_delim("data/all_counts.tsv", col_names = c("sample", "total_count")) %>%
  mutate(total_count = total_count * 2)

mag_mapping <- bin_counts %>%
  left_join(all_counts, by = join_by(sample)) %>%
  summarise(mag_mapping = sum(read_count) / mean(total_count), .by = c(sample, overall_strategy)) %>%
  summarise(mag_mapping = mean(mag_mapping), .by = overall_strategy) %>%
  mutate(overall_strategy=factor(overall_strategy,levels=rev(cluster_number$overall_strategy)))

mag_mapping %>%
  filter(!is.na(overall_strategy)) %>%
  rename(Strategy=overall_strategy,`Mapping %`=mag_mapping) %>%
  mutate(`Mapping %`=`Mapping %`*100) %>%
  tt()
tinytable_28rgk6hrgmdeunlsya4i
Strategy Mapping %
single_coverage 80.25097
multicoverage_animal 79.78157
multicoverage_timepoint_all 79.68070
multicoverage_timepoint_cage 80.48701
coassembly_animal 80.91453
coassembly_timepoint_all 82.53138
coassembly_timepoint_cage 80.11122
multisplit_animal 68.33532
multisplit_timepoint_all 83.70626
multisplit_timepoint_cage 77.11125
mag_mapping %>%
  filter(overall_strategy != "cobinning_treatment") %>%
  ggplot(aes(x = mag_mapping, y = overall_strategy)) +
  geom_col(color="#666666") +
  coord_cartesian(xlim = c(0.6,0.9))+
  theme_classic() +
  theme() +
  labs(x="Mapping percentage", y="Strategy")

3.6 Domain-adjusted mapping rates (DAMR)

#import SingleM estimates
smf_files <- list.files(path = "data/singlem/", 
                        pattern = "*tsv.gz", 
                        full.names = T)

smf_estimates <- purrr::map(smf_files, read_delim) %>%
  bind_rows()

mean_smf <- smf_estimates %>%
  mutate(read_fraction = as.numeric(str_replace(read_fraction, "%", ""))) %>%
  pull(read_fraction) %>%
  mean()

mag_mapping %>%
  filter(!is.na(overall_strategy)) %>%
  rename(Strategy=overall_strategy) %>%
  mutate(mag_mapping = mag_mapping * 100,
         DAMR = (mag_mapping / mean_smf) * 100,
         DAMR = scales::number(if_else(DAMR > 100, 100, DAMR),
                               accuracy = 0.1)
         ) %>%
  select(-mag_mapping) %>%
  tt()
tinytable_xcjqja5nmjfh16c27hen
Strategy DAMR
single_coverage 100.0
multicoverage_animal 100.0
multicoverage_timepoint_all 100.0
multicoverage_timepoint_cage 100.0
coassembly_animal 100.0
coassembly_timepoint_all 100.0
coassembly_timepoint_cage 100.0
multisplit_animal 87.9
multisplit_timepoint_all 100.0
multisplit_timepoint_cage 99.2

3.7 Read counts vs prevalence

maximum_read_count %>%
  left_join(cluster_prevalence,by=join_by(secondary_cluster==secondary_cluster)) %>% 
  left_join(cluster_taxonomy,by="secondary_cluster") %>%
  select(read_count, n_strategy, Phylum) %>%
  ggplot(aes(x = read_count, y = n_strategy, group=n_strategy, color=Phylum)) +
      geom_boxplot(color="#999999", fill="#f4f4f4", outlier.shape = NA) +
      scale_y_continuous(breaks=seq(1,10,1))+
      scale_color_manual(values=phylum_colors) +
      geom_jitter() +
      theme_classic() +
      theme() +
      labs(x="Sequencing depth", y="Strategy prevalence")

3.8 Read abundances

3.8.1 Per animal

bin_counts_norm_filt %>%
    select(secondary_cluster,sample,read_count,overall_strategy) %>%
    group_by(secondary_cluster,overall_strategy,sample) %>%
    summarise(read_count=sum(read_count)) %>% 
    mutate(animal=substr(sample,1,5)) %>% 
    mutate(overall_strategy=factor(overall_strategy,levels=cluster_tree$tip.label)) %>% 
    ggplot(aes(y=sample,x=secondary_cluster, fill=read_count))+
      geom_tile()+
      scale_fill_distiller(palette = "GnBu", direction = -1) +
      facet_nested(animal ~ ., space="free", scales="free") +
      theme(axis.text.x=element_blank(),
            axis.text.y=element_blank(),
            strip.text.y = element_text(angle = 0)) +
      labs(y="Samples",x="Clusters",fill="Abundance")

3.8.2 Per timepoint

bin_counts_norm_filt %>%
    select(secondary_cluster,sample,read_count,overall_strategy) %>%
    group_by(secondary_cluster,overall_strategy,sample) %>%
    summarise(read_count=sum(read_count)) %>% 
    mutate(timepoint=factor(substr(sample,6,7),levels=c("OP","HT","HR","CT","CR")))%>%
    mutate(overall_strategy=factor(overall_strategy,levels=cluster_tree$tip.label)) %>% 
    ggplot(aes(y=sample,x=secondary_cluster, fill=read_count))+
      geom_tile()+
      scale_fill_distiller(palette = "GnBu", direction = -1) +
      facet_nested(timepoint ~ ., space="free", scales="free") +
      theme(axis.text.x=element_blank(),
            axis.text.y=element_blank(),
            strip.text.y = element_text(angle = 0)) +
      labs(y="Samples",x="Clusters",fill="Abundance")

3.8.3 Per timepoint + cage

bin_counts_norm_filt %>%
    select(secondary_cluster,sample,read_count,overall_strategy) %>%
    group_by(secondary_cluster,overall_strategy,sample) %>%
    summarise(read_count=sum(read_count)) %>% 
    mutate(timepoint=factor(substr(sample,6,7),levels=c("OP","HT","HR","CT","CR")))%>%
    mutate(cage=substr(sample,1,3)) %>% 
    mutate(overall_strategy=factor(overall_strategy,levels=cluster_tree$tip.label)) %>% 
    ggplot(aes(y=sample,x=secondary_cluster, fill=read_count))+
      geom_tile()+
      scale_fill_distiller(palette = "GnBu", direction = -1) +
      facet_nested(timepoint + cage ~ ., space="free", scales="free") +
      theme(axis.text.x=element_blank(),
            axis.text.y=element_blank(),
            strip.text.y = element_text(angle = 0)) +
      labs(y="Samples",x="Clusters",fill="Abundance")